Libraries

library(tidyverse)
library(sf)
library(raster)

Load Data

Background image

# Load the merged Satellite image for Kazanlak Valley, Bulgaria
# use cds-spatial repo from github https://github.com/CDS-AU-DK/cds-spatial
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")

crs(kaz)
## CRS arguments:
##  +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs

Prediction data

# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows

# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
        main = "Probability of mound present in a satellite image gridcell");abline(v= 0.8, col="red", lty = 2, lwd = 3)

table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))
## 
##   (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8] 
##      7865      7710      6983      4757      2243       653       195        58 
## (0.8,0.9]   (0.9,1] 
##        30        10
# ok, so there is an exponential drop-off. 

Make the predictions to sf grid

Let’s make the predictions to a grid so we can assess whether gridcells with higher probabilities of mound presence in fact contain burial mounds. To keep the grid lightweight, let’s start with those gridcells that have values of mound probability at 80% and higher.

### Build a grid of those with 80%+ probability of containing a mound

# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")

# Filter predictions to those that have 80+% likelihood of containing a mound
cnn80_sp <- cnn_df %>% 
  filter(mound_probability >0.799) %>%  #333 observations
  st_as_sf(coords = c("x","y"), crs = 32635)

# Make a grid of 80%+ cells, 382 cells, 332 unique ones
cnn_grid80 <- st_make_grid(cnn80_sp, cellsize = 250, what = "polygons")
#plot(cnn_grid80)

# Add probability data to the 80%+ grid 
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid80 <- st_join(st_sf(cnn_grid80), cnn80_sp)

# Visualize the grid cells with higher probability 
ggplot(cnn_grid80) +
  geom_sf(aes(color = mound_probability))

# 333 raster cells are predicted to contain mounds with greater 
# than 80% likelihood. Archaeologists found 773 mounds in fraction of the area

# View the grid 
plotRGB(kaz, stretch= "lin");
plot(cnn_grid80$geometry, add = TRUE, border = "white")

Field data

that we will need later for validation

# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")
## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
plot(mounds$geometry, main = "Mounds found through survey")

# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")
## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS:  WGS 84 / UTM zone 35N
plot(survey$geometry, main = "Area covered by survey")

# Extrapolate rough study area after subtracting outliers
far <- mounds %>% 
  st_is_within_distance(survey, 1500) %>% 
  lengths>0 
farmounds <- which(far==0)  # these are the mounds that are mostly far from survey area

# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))

# convex hull of survey polygons 
survey_ch <- st_convex_hull(st_union(survey$geometry))

# quickly see the difference in bounding boxes
plot(survey_ch, col = "green", main = "Convex hull around survey area (tight and extensive)");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)

Plot the field data

# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid80$geometry, add = TRUE, border = "white");
legend("bottom", legend = "Grid cells with 80%+ probability of mounds (white squares) overlayed on \nthe satellite image and survey study area outlines with mounds (pink circles)")

Validation

The Tundzha Regional Archaeological project surveyed 85 sq km of the area included in the satellite imagery in 2009-2011, documenting the burial mounds within which create a dataset suitable for validation.

Area surveyed contains 773 documented mounds of various shapes and sizes. This area also contains 192 of 332 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?

Bring in mound attributes

# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
mounds <- mounds %>% 
  left_join(mounddata, by = c("TRAP_Code"="MoundID"))

Crop 80%+ cell grid to TRAP study area

Let’s see which of the grid cells with high probability of containing a mound are in the TRAP study area?

# Select gridcells that fall within TRAP study area
survey_grids80 <- st_intersection(survey_ch, cnn_grid80) # 40 grid cells have 80%+ likelihood of containing mound inside the study area
#plot(survey_grids80)

True Positives

Archaeological survey documented 773 mounds in the study area. Mounds from this dataset that fall in the 80%+ probability gridcells form the true positives and comprise the success of the CNN model.

# Check spatial overlap between 80%+ grid cells and all 773 mounds 
predicted80 <- st_intersection(mounds, cnn_grid80)# %>% distinct()

# Which ones are not-predicted? 
`%nin%` = Negate(`%in%`)
unpredicted80 <- mounds[mounds$TRAP_Code%nin%predicted80$TRAP_Code,] # 627 not predicted

paste0("there are ", length(unique(predicted80$TRAP_Code)), " unique mounds detected within the ",length(unique(cnn_grid80$X1)), " unique gridcells.")
## [1] "there are 25 unique mounds detected within the 40 unique gridcells."
head(predicted80,2) # predicted feature snippet
## Simple feature collection with 2 features and 11 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 360896.8 ymin: 4713055 xmax: 360896.8 ymax: 4713055
## projected CRS:  WGS 84 / UTM zone 35N
##      Notes TRAP_Code Cluster      Lat     Long Condition Robbed Height LandUse
## 82   Mound      4066       3 42.55737 25.30552         3      1      5 Pasture
## 82.1 Mound      4066       3 42.55737 25.30552         3      1      5 Pasture
##        X1 mound_probability                 geometry
## 82   4748         0.8855298 POINT (360896.8 4713055)
## 82.1 4749         0.8916270 POINT (360896.8 4713055)
grids_n_mounds <- predicted80 %>% 
#  st_drop_geometry() %>% 
  group_by(X1) %>% 
  count()

length(unique(grids_n_mounds$X1)) # all the detected mounds are withing 51 grids, out of which one is labelled NA, so 50 it is
## [1] 11
hist(grids_n_mounds$n, breaks = 40, 
     xlab = "Mound count in grid",
     ylab = "Grid count",
     main = paste0("Distribution of predicted mounds (n = ",length(unique(predicted80$TRAP_Code)), ") \n across ",nrow(grids_n_mounds)," TP grid cells among a total of  ",length(unique(cnn_grid80$X1))," with 80% likelihood"))

Update this: # Summary: 11 out of 40 grid cells (0.275) in the study area with 80%+ probability contain between 1 and 29 detected mounds. The high number makes sense because of the dense necropolis of little 10m diameter mounds in the NW of the valley.

The previous chunk shows that 25 of 773 unique mounds appear in 11 of the 40 (25%) “survey” grids that have 80% or higher probability of mounds and are at least partially within the TRAP study area. The intersection actually shows 34 intersecting mound points, but some fall on the border of two gridcells and are double-counted. The entire satellite image contains 40 grids with 80%+ probability of mounds, 40 fall in the TRAP study area and 11 of these contain 25 mounds (0.0323415% of total).

Add un/predicted column to mound data

# Add the information on prediction to mound dataset
mounds <- mounds %>% 
  mutate(predicted80 = 
           case_when(TRAP_Code %in% predicted80$TRAP_Code ~ "yes",                                TRAP_Code %nin% predicted80$TRAP_Code ~ "no"))

# Look at the predicted and unpredicted mounds 
plot(unpredicted80$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted80$geometry, col = "darkgreen", add= TRUE); 

Explore impact of height on detectability

Let’s look at whether mound Height is a factor impacting detectability. (it is a factor for visual inspection as higher mounds are more visible in sat img.)

# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>% 
  filter(Height>=2) # 247 obs

mounds %>% 
  group_by(predicted80) %>% 
  summarize(min = min(Height), max = max(Height))
## Simple feature collection with 2 features and 3 fields
## geometry type:  MULTIPOINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
##   predicted80   min   max                                               geometry
##   <chr>       <dbl> <dbl>                                       <MULTIPOINT [m]>
## 1 no            0      20 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
## 2 yes           0.1     7 ((353488.9 4725971), (353521.2 4725932), (353665.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted80, data = mounds, xlab = "Was mound detected?")

# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted80 == "no"], col = "pink", breaks  = 20,main = "Histogram of detected (yellow) and undetected (pink) mounds by height",
     xlab = "Height (m)");
hist(mounds$Height[mounds$predicted80 == "yes"], col = "yellow", add =TRUE, alpha = 0.5);

T-test on height

# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted80), data = mounds)
## 
##  Welch Two Sample t-test
## 
## data:  Height by as.factor(predicted80)
## t = -0.60744, df = 25.8, p-value = 0.5489
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -1.1695773  0.6361549
## sample estimates:
##  mean in group no mean in group yes 
##          1.733289          2.000000
str(t.test(Height ~ as.factor(predicted80), data = mounds))
## List of 10
##  $ statistic  : Named num -0.607
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 25.8
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 0.549
##  $ conf.int   : num [1:2] -1.17 0.636
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num [1:2] 1.73 2
##   ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "difference in means between group no and group yes"
##  $ stderr     : num 0.439
##  $ alternative: chr "two.sided"
##  $ method     : chr "Welch Two Sample t-test"
##  $ data.name  : chr "Height by as.factor(predicted80)"
##  - attr(*, "class")= chr "htest"
# ... no, as the p at 0.5 is not significant

False Negatives

Given 25 mounds were predicted by CNN, where are the remaining 748? Let’s explore where the missing mounds are and what probability has been assigned to the cells that contain them by CNN model.

# Create a grid of ALL the predictions to see where the remaining 627 unpredicted mounds are 
cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#plot(cnnall_sp)
cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons") 
#plot(cnnall_grid) # we can see that the cells overlap in the middle

cnnall_grid <- st_join(st_sf(cnnall_grid), cnnall_sp) # add attributes

# Trying to eliminate overlap in gridcells
# eliminating id duplicates is nonsensical as the two datasets have same ids, I  must aggregate spatially, e.g. st_union or st_difference, or relabel the datasets before merging, or crop to a shared non-overlapping border.
missing80 <- st_intersection(unpredicted80, cnnall_grid) # 2158, clearly duplicates

# Check for duplicates
?distinct()
length(duplicated(missing80$TRAP_Code)) # 2158 points to duplication
## [1] 2606
missing80 <- missing80[!duplicated(missing80$TRAP_Code),] #620 undetected mounds

How many grids are the 741-8 missing mounds located within?

# How many grids are the missing mounds in?
missing80 %>% 
  st_drop_geometry() %>% 
  group_by(X1) %>% 
  tally() %>% arrange(desc(n))
## # A tibble: 279 x 2
##       X1     n
##    <dbl> <int>
##  1  7551    28
##  2  7921    27
##  3  9209    15
##  4  7366    13
##  5  8844    13
##  6  8292    12
##  7  8294    12
##  8  9766    12
##  9  7367    11
## 10  7552    11
## # ... with 269 more rows
# The missing 740 mounds are contained within 279 grid cells of the satellite image 

# Let's see the distribution of missing mounds per grid cell
missing80 %>% 
  group_by(X1) %>% 
  count() %>% 
  ggplot()+ geom_histogram(aes(n))+ggtitle("Count of undetected mounds (748) per gridcell(n=259)") + theme_bw()

# some gridcells contain up to 28 undetected mounds. No surprise with the NW necropolis

Undetected mounds appear in 259 gridcells, which contain between 1 and 29 mounds, heavily skewed to the left.

What is the probability of the cells that contain undetected mounds?

# What is the mound_probability in these gridcells?
hist(missing80$mound_probability, 
     xlab = "Cnn-generated probability of mound",
     main = "Probability assigned to gridcells containing undetected mounds")

Somehow 5 of the undetected mounds still have high mound-probability!? How have these snuck through? Maybe they are outside the study area?

Discrepancy in gridcell/mound probability

Probability in five undetected mounds is showing to be over 0.59 when it should be below! Let’s visually inspect what is going on and investigate:

# Ggplot of mounds with high prob and grid with high prob
# missing %>% 
#   filter(mound_probability>0.59) %>% 
#   ggplot()+
#   geom_sf(aes(color = mound_probability))+
#   geom_sf(data = cnn_grid80, alpha = 0.5)

library(mapview)
missing80 %>% 
  filter(mound_probability>0.59) %>% 
  mapview(color = "red")+
 # mapview(cnn_grid80, alpha = 0.5)+
  mapview(cnnall_grid, zcol= "mound_probability")

NO undetected mounds fall in cells with high probability(80%+). Here I display the undetectd mounds in cells with 59%+ probability

Ideally, I should be probably running the validation separately for Kaz-East and Kaz-West images as that is how the CNN values have been generated. However, it is interesting that the model generates different values for the same locations.

Most of the visualisation below is just a rough guide as the overlap in grids needs to be sorted.

Visualize results

# See the predicted mounds over grids with 80% likelihood of mound plus unpredicted mounds

library(mapview)
mapview(unpredicted80, color = "orange", alpha= 0.5,
         map.types = c("Esri.WorldImagery", "CartoDB.Positron"),
         layer.name = "Undetected mounds")+
  mapview(grids_n_mounds, zcol = "n", at = c(1, 2, 5, 10, 15, 30),
          layer.name = "Detected mound no per gridcell")+
  mapview(survey_grids80, layer.name = "Gridcells with 80%+ probability")

Summary of success rates

146 mounds was detected out of 773, making the CNN predict (over 80% probability) ca 20% of mounds documented in the study area. Out of the 192 gridcells which are within the archaeological study area and have 80%+ probabilty of containing a mound, 50 actually contained between 1 and 29 mounds. Smaller mounds seem to enjoy higher detection rate according to the t-test, but that may not be the right metric. Let’s say that surprisingly, the large mounds are not detected as frequently as I would expect, and as human would detect them.

The remaining undetected 620 mounds were found in 259 gridcells, clustered from 1 to 28 specimen. Their exploration shows a problem with underlying grid overlap which cause reduplication of probability values (which are not identical) and proliferation of mound duplicates which each have different probability value! See section discrepancy above.

Once overlap is resolved, I can slide the probability downwards and downwards and see if I get higher veracity (more cells will be TPs) with higher probability or decrease missing values (fewer FNs) with lower probability cells.

What is the model succeeding at and what not?

Look at the NW cluster and see that many mounds sit on the border of polygons, and so get counted twice > ca 20 mounds, so the numbers of predicted are slightly exaggerated In NE we are missing a lot of very large 20m diameter+ mounds that are covered with scrub (look forested) In the south a lot of forest patches and beach boundaries are getting picked up that are false positives. Here the overlap may be at play !! REVISIT. All in all the model seems to pick on the bright signatures rather than round shapes and there is an odd combination of large forested lozenges and small bare spots picked up, but somehow round forested images with little excavated (bright) trenches on the south side is not getting picked up.

Additional visualisations

# Where are the mounds vis-a-vis the probability values
ggplot()+
  geom_sf(data = mounds, alpha = 0.6) +
  geom_sf(data = cnn_grid80, aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  
  ggtitle("Location of known mounds vis-a-vis the CNN predictions")

# See the 34/25 mound features predicted with 80%+ probability 
cnn_grid80 %>% 
 # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  geom_sf(data = mounds$geometry[mounds$predicted80 == "yes"], size = mounds$Height[mounds$predicted80 == "yes"], col = "lightgrey", alpha = 0.5) +
  ggtitle("Kazanlak mounds predicted with 80% likelihood threshold")

# See the 748 mounds not caught by the cells with 80%+ probability 
cnn_grid80 %>% 
  # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  #geom_sf(data = unpredicted$geometry)+
  geom_sf(data = mounds$geometry[mounds$predicted80 == "no"], size = mounds$Height[mounds$predicted80 == "no"], col = "lightgrey", alpha = 0.5)+
  ggtitle("Kazanlak mounds unpredicted at 80% threshold")

Side-by-side views

These images take mound height into account and split the results into facets

# Cumulative facetted visual

ggplot()+
  geom_sf(data = cnn_grid80, aes(fill = mound_probability)) +
  labs(fill = "Probability of mound")+
  # add mounds
  geom_sf(data = mounds, 
          aes(size = Height, col = "red",  alpha = 0.6)) +
  facet_wrap(~predicted80)+
  theme_bw() + 
  guides(color = FALSE,
         size = guide_legend(order = 1),
         fill = guide_legend(order = 2),
         alpha = FALSE)+
  ggtitle("Mounds detected at 80%+ probability threshold (no=748, yes=24)")

par(mfrow = c(1,2))

# Look at the predicted and un=predicted mounds 
plotRGB(kaz, stretch= "lin");
legend("top", legend = "Detection differences by mound height")
plot(mounds$geometry[mounds$predicted80 == "no"], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted80 == "yes"], add = TRUE, col = "green")
plot(cnn_grid80$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("undetected", "detected",
                  "grids with 80%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors
     

# View Large mounds only
plotRGB(kaz, stretch= "lin");
plot(mounds$geometry[mounds$predicted80 == "no" & mounds$Height >=2], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted80 == "yes"& mounds$Height >=2], add = TRUE, col = "green")
plot(cnn_grid80$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("undetected 2m+ mounds", "detected 2m+ mounds",
                  "grids with 80%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors